Load packages
# Bootstrap the pacman package manager; install it first if it is missing.
# (require() alone only warns and returns FALSE when the package is absent.)
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
library(pacman)
Caricamento del pacchetto richiesto: pacman
# Load CRAN packages. randomForest, caret, nnet and ggplot2 are all used
# below (rf fits, train(), method = "nnet", ggplot) but were missing from
# the original load list, so the script could not run top-to-bottom.
pacman::p_load(data.table, gdata, caTools, plotly, reshape2,
               randomForest, caret, nnet, ggplot2)
# NOTE(review): rmse() and replace_na() are assumed to be provided by smef —
# verify; otherwise add Metrics / tidyr to the p_load call above.
pacman::p_load_gh("luca-scr/smef")
# Return statistical mode of v
# Statistical mode: the value of v that occurs most often.
# Ties are broken in favour of the value appearing first in v, because
# which.max() returns the index of the first maximum.
getmode <- function(v) {
  distinct_vals <- unique(v)
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
# Fix the RNG seed so the train/validation split and model fits are reproducible
set.seed(100)
Load dataset
# Read the training data; fread() returns a data.table
car_data = fread("./data/car_price_train.csv")
Dataset info
# Quick look: first rows and dimensions (8938 rows x 18 columns)
head(car_data)
dim(car_data)
[1] 8938 18
# Summary statistics and the column types of the table
smef::describe(car_data)
str(car_data)
Classes ‘data.table’ and 'data.frame': 8938 obs. of 18 variables:
$ ID : int 2969 4640 3292 10199 10150 2334 5859 8830 820 1294 ...
$ Make : chr "Chevrolet" "Ford" "Nissan" "Toyota" ...
$ Model : chr "Corvette" "F-250" "Cube" "Tacoma" ...
$ Year : int 2015 1999 2013 2016 2013 2006 2014 2016 2012 2015 ...
$ Engine_Fuel_Type : chr "premium_unleaded" "regular_unleaded" "regular_unleaded" "regular_unleaded" ...
$ Engine_HP : int 460 220 122 278 150 180 265 449 220 335 ...
$ Engine_Cylinders : int 8 8 4 6 4 6 4 8 4 6 ...
$ Transmission_Type: chr "manual" "automatic" "automatic" "automatic" ...
$ Driven_Wheels : chr "rear_wheel_drive" "four_wheel_drive" "front_wheel_drive" "four_wheel_drive" ...
$ Number_of_Doors : int 2 2 4 4 4 4 4 4 4 4 ...
$ Market_Category : chr "High-Performance" "Unknown" "Unknown" "Unknown" ...
$ Vehicle_Size : chr "Compact" "Large" "Compact" "Compact" ...
$ Vehicle_Style : chr "Convertible" "Pickup" "Wagon" "Pickup" ...
$ Highway_MPG : int 29 15 31 23 33 24 25 26 29 30 ...
$ City_MPG : int 17 12 27 18 23 17 19 17 18 23 ...
$ Age : int 2 18 4 1 4 11 3 1 5 2 ...
$ Popularity : int 1385 5657 2009 2031 481 1851 640 617 376 3916 ...
$ MSRP : num 11.13 8.26 9.78 10.41 9.67 ...
- attr(*, ".internal.selfref")=<externalptr>
List of categorical features
# Names of the categorical (character) columns of car_data
cat_feat <- c("Make", "Model", "Engine_Fuel_Type", "Transmission_Type",
              "Driven_Wheels", "Market_Category", "Vehicle_Size",
              "Vehicle_Style")
Unique values for categorical features
# Unique values of each categorical feature. with = FALSE makes data.table
# treat cat_feat as a vector of column names (never abbreviate TRUE/FALSE).
lapply(car_data[, cat_feat, with = FALSE], unique)
MSRP distribution
# Distribution of the response; values ~8-14 suggest MSRP is on a log
# scale — presumably log-transformed prices, TODO confirm
hist(car_data[, MSRP], breaks = 50, main = "Histogram of MSRP", xlab = "MSRP")
Count Unknown/unknown values for each categorical feature
# Count "Unknown"/"unknown" entries per categorical feature
# (gdata::isUnknown flags values matching the supplied unknown set)
lapply(car_data[, cat_feat, with = FALSE],
       function(col) sum(isUnknown(col, unknown = c("Unknown", "unknown"))))
$Make
[1] 0
$Model
[1] 0
$Engine_Fuel_Type
[1] 3
$Transmission_Type
[1] 15
$Driven_Wheels
[1] 0
$Market_Category
[1] 2817
$Vehicle_Size
[1] 0
$Vehicle_Style
[1] 0
Drop Market Category column (too many Unknown values)
# Drop Market_Category in place (:= NULL removes a data.table column)
car_data[, Market_Category := NULL]
head(car_data)
Drop ID column
# Drop the ID column in place: it is a row identifier, not a predictor
car_data[, ID := NULL]
head(car_data)
Drop Year column
# Drop Year in place — presumably redundant with the Age column, TODO confirm
car_data[, Year := NULL]
head(car_data)
Replace Unknown/unknown values with NA
# Recode both spellings of "Unknown" to NA in a single pass over all columns
# (the original made two full copies of the table, one per spelling)
car_data <- car_data[, lapply(.SD, function(x) replace(x, which(x %in% c("Unknown", "unknown")), NA))]
# Total NA count after recoding (expect 18 = 3 fuel-type + 15 transmission)
sum(is.na(car_data))
[1] 18
Replace NA values for Engine_Fuel_Type and Transmission_Type with their respective mode values
# Impute the remaining NAs with the per-column mode.
# The original passed the whole two-column data.table to getmode(), which
# does not compute column-wise modes; lapply() applies getmode per column
# and yields the named list replace_na() expects.
fuel_transmission_mode <- lapply(
  car_data[, c("Engine_Fuel_Type", "Transmission_Type"), with = FALSE],
  getmode)
car_data <- replace_na(car_data, replace = fuel_transmission_mode)
# Sanity check: no missing values should remain
sum(is.na(car_data))
Split dataset into training and validation set
# 70/30 split that preserves the distribution of the outcome.
# sample.split() returns a logical vector, so use it directly instead of
# comparing against the string "TRUE" (which only worked via coercion).
split <- sample.split(car_data$MSRP, SplitRatio = 0.7)
train_set <- subset(car_data, split)
dim(train_set)
[1] 6256 15
# Validation rows are the complement of the training rows; negate the
# logical split vector instead of comparing against the string "FALSE"
valid_set <- subset(car_data, !split)
dim(valid_set)
[1] 2682 15
Fit the model using the whole training set
# Random forest on all remaining features.
# mtry = number of predictors means every variable is tried at each split,
# which makes the forest behave like bagged trees.
rf_reg = randomForest(x = train_set[, .SD, .SDcols = !'MSRP'],
                      y = train_set[, MSRP],
                      ntree = 10,
                      nodesize = 1,
                      mtry = ncol(train_set) - 1,
                      importance = TRUE)
rf_reg
Call:
randomForest(x = train_set[, .SD, .SDcols = !"MSRP"], y = train_set[, MSRP], ntree = 10, mtry = dim(train_set)[2] - 1, nodesize = 1, importance = T)
Type of random forest: regression
Number of trees: 10
No. of variables tried at each split: 14
Mean of squared residuals: 0.01928734
% Var explained: 98.41
Calculate RMSE on train set
# In-sample predictions and RMSE for the full-feature forest
y_pred = predict(rf_reg, newdata=train_set[, .SD, .SDcols = !'MSRP'])
rmse(train_set[, MSRP], y_pred)
[1] 0.06707272
Calculate RMSE on validation set
# Out-of-sample RMSE on the held-out validation set
y_pred = predict(rf_reg, newdata = valid_set[, .SD, .SDcols = !'MSRP'])
rmse(valid_set[, MSRP], y_pred)
[1] 0.1328723
Visualize results
# Overlay observed (blue) and predicted (red) MSRP distributions;
# translucent colours make the overlap visible
hist(valid_set[, MSRP],
     breaks = 50,
     col = rgb(0, 0, 1, 1/4),
     main = "Random Forest",
     xlab = "MSRP")
hist(y_pred, breaks = 50, col = rgb(1, 0, 0, 1/4), add = TRUE)
box()
Feature Importance (normalized)
# Node-purity importance (type = 2), normalized so the column sums to 1
feat_imp = randomForest::importance(rf_reg, type = 2)
feat_imp <- scale(feat_imp, center = FALSE, scale = colSums(feat_imp))
feat_imp
IncNodePurity
Make 0.0088028123
Model 0.0040847728
Engine_Fuel_Type 0.0043276962
Engine_HP 0.1940287592
Engine_Cylinders 0.0101458630
Transmission_Type 0.0032721459
Driven_Wheels 0.0012693471
Number_of_Doors 0.0007883784
Vehicle_Size 0.0022554922
Vehicle_Style 0.0031662083
Highway_MPG 0.0045307032
City_MPG 0.0086011284
Age 0.7494937472
Popularity 0.0052329458
attr(,"scaled:scale")
IncNodePurity
7648.789
Top 10 Features
# Ten most important features by normalized importance; compute the
# ordering once instead of twice
ord <- order(feat_imp[, 1], decreasing = TRUE)
top_values = feat_imp[ord, ][1:10]
top_feat = rownames(feat_imp)[ord[1:10]]
barplot(height = top_values, names = top_feat, col = "#69b3a2", las = 2)
Select topmost numerical features as FEATURES of INTEREST (FOI)
# Features of interest: the strongest numerical predictors from the
# importance ranking above
foi <- c("Age", "Engine_HP", "Engine_Cylinders",
         "City_MPG", "Highway_MPG", "Popularity")
Prepare Dataset
# Restrict train/validation sets to the features of interest plus the
# response; c() is the idiomatic way to concatenate the column names
train_set_foi = train_set[, c(foi, "MSRP"), with = FALSE]
head(train_set_foi)
valid_set_foi = valid_set[, c(foi, "MSRP"), with = FALSE]
head(valid_set_foi)
Fit the model
# Random forest restricted to the features of interest
# (mtry = all predictors again)
rf_reg_foi = randomForest(x = train_set_foi[, .SD, .SDcols = !'MSRP'],
                          y = train_set_foi[, MSRP],
                          ntree = 10,
                          nodesize = 1,
                          mtry = ncol(train_set_foi) - 1)
rf_reg_foi
Call:
randomForest(x = train_set_foi[, .SD, .SDcols = !"MSRP"], y = train_set_foi[, MSRP], ntree = 10, mtry = dim(train_set_foi)[2] - 1, nodesize = 1)
Type of random forest: regression
Number of trees: 10
No. of variables tried at each split: 6
Mean of squared residuals: 0.02395499
% Var explained: 98.02
Calculate RMSE on train set
# In-sample RMSE for the reduced-feature forest
y_pred = predict(rf_reg_foi, newdata = train_set_foi[, .SD, .SDcols = !'MSRP'])
rmse(train_set_foi[, MSRP], y_pred)
[1] 0.08181244
Calculate RMSE on validation set
# Validation RMSE for the reduced-feature forest
y_pred = predict(rf_reg_foi, newdata = valid_set_foi[, .SD, .SDcols = !'MSRP'])
rmse(valid_set_foi[, MSRP], y_pred)
[1] 0.1430698
Visualize results
# Observed (blue) vs predicted (red) MSRP for the reduced-feature forest
hist(valid_set[, MSRP],
     breaks = 50,
     col = rgb(0, 0, 1, 1/4),
     main = "Random Forest 2",
     xlab = "MSRP")
hist(y_pred, breaks = 50, col = rgb(1, 0, 0, 1/4), add = TRUE)
box()
Fit the model …
# Tune mtry by 10-fold cross-validation with caret; the oneSE rule selects
# the simplest model within one standard error of the best RMSE
rf_reg_2 = train(MSRP ~ .,
                 data = train_set_foi,
                 method = 'rf',
                 tuneGrid = expand.grid(mtry = 1:(ncol(train_set_foi) - 1)),
                 ntree = 10,
                 nodesize = 5,
                 trControl = trainControl(method = 'cv',
                                          number = 10,
                                          selectionFunction = "oneSE"))
rf_reg_2
Random Forest
6256 samples
6 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 5631, 5629, 5631, 5632, 5630, 5631, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE
1 0.2165607 0.9648372 0.15040352
2 0.1528493 0.9805445 0.10170087
3 0.1465201 0.9821106 0.09644810
4 0.1459980 0.9821635 0.09700915
5 0.1455816 0.9823451 0.09656957
6 0.1482990 0.9816152 0.09781316
RMSE was used to select the optimal model using the one SE rule.
The final value used for the model was mtry = 3.
… or load trained forest
# Alternative to refitting: restore the previously tuned forest from disk
load("models/m_1_3_tuned_random_forest.RData")
rf_reg_2
Random Forest
6256 samples
6 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 5631, 5629, 5631, 5632, 5630, 5631, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE
1 0.2165607 0.9648372 0.15040352
2 0.1528493 0.9805445 0.10170087
3 0.1465201 0.9821106 0.09644810
4 0.1459980 0.9821635 0.09700915
5 0.1455816 0.9823451 0.09656957
6 0.1482990 0.9816152 0.09781316
RMSE was used to select the optimal model using the one
SE rule.
The final value used for the model was mtry = 3.
Calculate RMSE on train set
# In-sample RMSE for the tuned forest
y_pred = predict(rf_reg_2, newdata = train_set_foi[, .SD, .SDcols = !'MSRP'])
rmse(train_set_foi[, MSRP], y_pred)
[1] 0.08811463
Calculate RMSE on validation set
# Validation RMSE for the tuned forest
y_pred = predict(rf_reg_2, newdata = valid_set_foi[, .SD, .SDcols = !'MSRP'])
rmse(valid_set_foi[, MSRP], y_pred)
[1] 0.1357432
Visualize results
# Observed (blue) vs predicted (red) MSRP for the tuned forest
hist(valid_set[, MSRP],
     breaks = 50,
     col = rgb(0, 0, 1, 1/4),
     main = "Random Forest 3",
     xlab = "MSRP")
hist(y_pred, breaks = 50, col = rgb(1, 0, 0, 1/4), add = TRUE)
box()
Scale dataset
# Standardize predictors: fit centering/scaling on the training features only
X_train = train_set_foi[, .SD, .SDcols = !'MSRP']
Y_train = train_set_foi[, MSRP]
dataScaler = caret::preProcess(X_train, method = c("center", "scale"))
X_train_scaled = predict(dataScaler, X_train)
# Applying the scaler to the full validation table transforms only the
# columns it was trained on; MSRP passes through unchanged (see its
# un-scaled summary in the describe() output below)
X_valid_scaled = predict(dataScaler, valid_set_foi)
smef::describe(X_train_scaled)
Obs Mean StdDev Min Median Max
Age 6256 0 1 -0.873 -0.610 2.677
Engine_HP 6256 0 1 -1.739 -0.207 6.982
Engine_Cylinders 6256 0 1 -3.182 0.210 5.864
City_MPG 6256 0 1 -1.353 -0.183 12.482
Highway_MPG 6256 0 1 -1.710 -0.183 9.925
Popularity 6256 0 1 -1.068 -0.114 2.835
# Validation features after scaling (means near 0, SDs near 1 confirm it)
smef::describe(X_valid_scaled)
Obs Mean StdDev Min Median Max
Age 2682 -0.0075 0.9822 -0.8728 -0.4784 2.6767
Engine_HP 2682 0.0000 1.0210 -1.8041 -0.2252 6.9820
Engine_Cylinders 2682 0.0038 1.0001 -3.1820 0.2102 5.8638
City_MPG 2682 -0.0078 0.8428 -1.2469 -0.1826 11.9502
Highway_MPG 2682 0.0184 1.1713 -1.7105 -0.0650 38.4863
Popularity 2682 0.0132 1.0150 -1.0682 -0.1136 2.8350
MSRP 2682 10.1225 1.1025 7.6009 10.3106 14.5411
Fit the model …
# Single-hidden-layer network (nnet) with 64 units; tune weight decay by
# 10-fold CV. linout = TRUE gives a linear output unit, as required for
# regression (the default logistic output would bound predictions to [0, 1]).
nn_reg = train(x = X_train_scaled,
               y = Y_train,
               method = "nnet",
               tuneGrid = expand.grid(decay = c(0.01, 0.1, 1), size = 64),
               linout = TRUE,
               maxit = 100,
               trace = FALSE,
               trControl = trainControl(method = "cv",
                                        number = 10,
                                        selectionFunction = "oneSE"))
nn_reg
Neural Network
6256 samples
6 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 5631, 5631, 5631, 5631, 5630, 5629, ...
Resampling results across tuning parameters:
decay RMSE Rsquared MAE
0.01 0.2246159 0.9583626 0.1629258
0.10 0.2273319 0.9572557 0.1627390
1.00 0.2642568 0.9423997 0.1847960
Tuning parameter 'size' was held constant at a value of 64
RMSE was used to select the optimal model using the one SE rule.
The final values used for the model were size = 64 and decay = 0.1.
… or load trained network
# Alternative to refitting: restore the trained network from disk
load("models/m_2_neural_network.RData")
print(nn_reg)
Neural Network
6256 samples
6 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 5631, 5631, 5631, 5631, 5630, 5629, ...
Resampling results across tuning parameters:
decay RMSE Rsquared MAE
0.01 0.2246159 0.9583626 0.1629258
0.10 0.2273319 0.9572557 0.1627390
1.00 0.2642568 0.9423997 0.1847960
Tuning parameter 'size' was held constant at a value of 64
RMSE was used to select the optimal model using the one
SE rule.
The final values used for the model were size = 64 and decay
= 0.1.
Calculate RMSE on train set
# In-sample RMSE for the network (predict on the scaled training features)
y_pred = predict(nn_reg, newdata=X_train_scaled)
rmse(train_set_foi[, MSRP], y_pred)
[1] 0.2168384
Calculate RMSE on validation set
# Validation RMSE for the network
y_pred = predict(nn_reg, newdata = X_valid_scaled)
rmse(valid_set_foi[, MSRP], y_pred)
[1] 0.2244429
Visualize results
# Observed (blue) vs predicted (red) MSRP for the neural network
hist(valid_set[, MSRP],
     breaks = 50,
     col = rgb(0, 0, 1, 1/4),
     main = "Neural Network",
     xlab = "MSRP")
hist(y_pred, breaks = 50, col = rgb(1, 0, 0, 1/4), add = TRUE)
box()
Plot Age, Engine_HP and MSRP
# 3-D scatter of the two strongest predictors (Age, Engine_HP) vs the response
plot_ly(train_set, x = ~Age, y = ~Engine_HP, z = ~MSRP, size = 1)
No trace type specified:
Based on info supplied, a 'scatter3d' trace seems appropriate.
Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
No scatter3d mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
No trace type specified:
Based on info supplied, a 'scatter3d' trace seems appropriate.
Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
No scatter3d mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Set an Age threshold
# Age threshold separating the 'young' and 'old' car regimes modelled below
age_ths = 17
Linear Regression for ‘YOUNG’ cars
# Subset to 'young' cars. In data.table a bare logical expression in i
# selects rows, so the original's double indexing DT[DT[, cond]] is
# redundant.
young_train_set = train_set[Age < age_ths]
young_valid_set = valid_set[Age < age_ths]
# Linear model with main effects and an Age x Engine_HP interaction
young_lin_reg = lm(MSRP ~ Age + Engine_HP + Age:Engine_HP,
                   data = young_train_set)
summary(young_lin_reg)
Call:
lm(formula = MSRP ~ Age + Engine_HP + Age:Engine_HP, data = young_train_set)
Residuals:
Min 1Q Median 3Q Max
-1.52031 -0.16691 -0.01932 0.14530 1.84610
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.466e+00 1.402e-02 675.14 <2e-16 ***
Age -5.094e-02 2.324e-03 -21.92 <2e-16 ***
Engine_HP 3.971e-03 4.862e-05 81.67 <2e-16 ***
Age:Engine_HP 2.002e-04 9.341e-06 21.43 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2834 on 5387 degrees of freedom
Multiple R-squared: 0.7681, Adjusted R-squared: 0.7679
F-statistic: 5946 on 3 and 5387 DF, p-value: < 2.2e-16
Calculate RMSE on training and validation set
# Training RMSE for the young-car linear model
y_pred = predict(young_lin_reg, newdata = young_train_set)
rmse(young_train_set[, MSRP], y_pred)
[1] 0.2833367
# Validation RMSE for the young-car linear model
y_pred = predict(young_lin_reg, newdata = young_valid_set)
rmse(young_valid_set[, MSRP], y_pred)
[1] 0.2772316
Visualize results
# Observed (blue) vs predicted (red) MSRP for the young-car linear model
hist(young_valid_set[, MSRP],
     breaks = 50,
     col = rgb(0, 0, 1, 1/4),
     main = "Linear Regression (Young Cars)",
     xlab = "MSRP")
hist(y_pred, breaks = 50, col = rgb(1, 0, 0, 1/4), add = TRUE)
box()
Visualize model
# Evaluate the regression surface on an Age x Engine_HP grid and overlay
# it on the 3-D scatter of training points
graph_res <- 1
axis_x <- seq(0, age_ths, by = graph_res)
axis_y <- seq(min(young_train_set$Engine_HP), max(young_train_set$Engine_HP), by = graph_res)
reg_surface <- expand.grid(Age = axis_x, Engine_HP = axis_y, KEEP.OUT.ATTRS = FALSE)
reg_surface$MSRP <- predict(young_lin_reg, newdata = reg_surface)
# acast reshapes the long grid into the Engine_HP x Age matrix plotly expects
reg_surface <- acast(reg_surface, Engine_HP ~ Age, value.var = "MSRP")
young_plot <- plot_ly(young_train_set, x = ~Age, y = ~Engine_HP, z = ~MSRP, type = "scatter3d", size = 1, mode = "markers")
# inherit = FALSE stops the surface trace from inheriting the scatter3d
# 'mode' attribute, which triggered "'surface' objects don't have these
# attributes: 'mode'" warnings
young_plot <- add_trace(young_plot, x = axis_x, y = axis_y, z = reg_surface, type = "surface", inherit = FALSE)
young_plot
Avvertimento: 'surface' objects don't have these attributes: 'mode'
Valid attributes include:
'_deprecated', 'autocolorscale', 'cauto', 'cmax', 'cmid', 'cmin', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'contours', 'customdata', 'customdatasrc', 'hidesurface', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'lighting', 'lightposition', 'meta', 'metasrc', 'name', 'opacity', 'opacityscale', 'reversescale', 'scene', 'showlegend', 'showscale', 'stream', 'surfacecolor', 'surfacecolorsrc', 'text', 'textsrc', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
Avvertimento: 'surface' objects don't have these attributes: 'mode'
Valid attributes include:
'_deprecated', 'autocolorscale', 'cauto', 'cmax', 'cmid', 'cmin', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'contours', 'customdata', 'customdatasrc', 'hidesurface', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'lighting', 'lightposition', 'meta', 'metasrc', 'name', 'opacity', 'opacityscale', 'reversescale', 'scene', 'showlegend', 'showscale', 'stream', 'surfacecolor', 'surfacecolorsrc', 'text', 'textsrc', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
Linear Regression for ‘OLD’ cars
# Subset to 'old' cars (complement of the young set); a bare logical
# expression in i selects rows directly
old_train_set = train_set[Age >= age_ths]
old_valid_set = valid_set[Age >= age_ths]
# Quadratic in Engine_HP plus an Age x Engine_HP interaction
old_lin_reg = lm(MSRP ~ Age + Age:Engine_HP + poly(Engine_HP, 2),
                 data = old_train_set)
summary(old_lin_reg)
Call:
lm(formula = MSRP ~ Age + Age:Engine_HP + poly(Engine_HP, 2),
data = old_train_set)
Residuals:
Min 1Q Median 3Q Max
-0.59785 -0.09773 -0.01678 0.02318 2.71053
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.383e+00 7.238e-02 115.814 < 2e-16 ***
Age 2.495e-02 1.081e-02 2.309 0.0212 *
poly(Engine_HP, 2)1 1.582e+01 2.264e+00 6.988 5.58e-12 ***
poly(Engine_HP, 2)2 1.685e+00 2.753e-01 6.120 1.42e-09 ***
Age:Engine_HP -3.222e-04 6.107e-05 -5.276 1.67e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.269 on 860 degrees of freedom
Multiple R-squared: 0.349, Adjusted R-squared: 0.346
F-statistic: 115.3 on 4 and 860 DF, p-value: < 2.2e-16
Calculate RMSE on training and validation set
# Training RMSE for the old-car linear model
y_pred = predict(old_lin_reg, newdata = old_train_set)
rmse(old_train_set[, MSRP], y_pred)
[1] 0.2681941
# Validation RMSE for the old-car linear model
y_pred = predict(old_lin_reg, newdata = old_valid_set)
rmse(old_valid_set[, MSRP], y_pred)
[1] 0.1849445
Visualize results
# Observed (blue) vs predicted (red) MSRP for the old-car linear model
hist(old_valid_set[, MSRP],
     breaks = 20,
     col = rgb(0, 0, 1, 1/4),
     main = "Linear Regression (Old Cars)",
     xlab = "MSRP")
hist(y_pred, breaks = 20, col = rgb(1, 0, 0, 1/4), add = TRUE)
box()
Visualize model
# Evaluate the old-car regression surface on an Age x Engine_HP grid and
# overlay it on the 3-D scatter of validation points
graph_res <- 1
axis_x <- seq(age_ths, max(old_train_set$Age), by = graph_res)
axis_y <- seq(min(old_train_set$Engine_HP), max(old_train_set$Engine_HP), by = graph_res)
reg_surface <- expand.grid(Age = axis_x, Engine_HP = axis_y, KEEP.OUT.ATTRS = FALSE)
reg_surface$MSRP <- predict(old_lin_reg, newdata = reg_surface)
# acast reshapes the long grid into the Engine_HP x Age matrix plotly expects
reg_surface <- acast(reg_surface, Engine_HP ~ Age, value.var = "MSRP")
old_plot <- plot_ly(old_valid_set, x = ~Age, y = ~Engine_HP, z = ~MSRP, type = "scatter3d", size = 1, mode = "markers")
# inherit = FALSE prevents the surface trace from inheriting the scatter3d
# 'mode' attribute (source of the warnings in the original run)
old_plot <- add_trace(old_plot, x = axis_x, y = axis_y, z = reg_surface, type = "surface", inherit = FALSE)
old_plot
Avvertimento: 'surface' objects don't have these attributes: 'mode'
Valid attributes include:
'_deprecated', 'autocolorscale', 'cauto', 'cmax', 'cmid', 'cmin', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'contours', 'customdata', 'customdatasrc', 'hidesurface', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'lighting', 'lightposition', 'meta', 'metasrc', 'name', 'opacity', 'opacityscale', 'reversescale', 'scene', 'showlegend', 'showscale', 'stream', 'surfacecolor', 'surfacecolorsrc', 'text', 'textsrc', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
Avvertimento: 'surface' objects don't have these attributes: 'mode'
Valid attributes include:
'_deprecated', 'autocolorscale', 'cauto', 'cmax', 'cmid', 'cmin', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'contours', 'customdata', 'customdatasrc', 'hidesurface', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'lighting', 'lightposition', 'meta', 'metasrc', 'name', 'opacity', 'opacityscale', 'reversescale', 'scene', 'showlegend', 'showscale', 'stream', 'surfacecolor', 'surfacecolorsrc', 'text', 'textsrc', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
#
Using only Engine_HP
# Simpler alternative: quadratic in Engine_HP only.
# NOTE(review): this overwrites old_lin_reg, so the test-set predictions
# further below use this reduced model (validation RMSE 0.203) rather than
# the Age-interaction model above (0.185) — confirm this is intended.
old_lin_reg = lm(MSRP ~ poly(Engine_HP, 2),
                 data = old_train_set)
summary(old_lin_reg)
Call:
lm(formula = MSRP ~ poly(Engine_HP, 2), data = old_train_set)
Residuals:
Min 1Q Median 3Q Max
-0.95194 -0.10852 -0.03088 0.01355 2.76294
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.752768 0.009711 798.37 < 2e-16 ***
poly(Engine_HP, 2)1 4.751969 0.285600 16.64 < 2e-16 ***
poly(Engine_HP, 2)2 1.639302 0.285600 5.74 1.31e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2856 on 862 degrees of freedom
Multiple R-squared: 0.2644, Adjusted R-squared: 0.2627
F-statistic: 154.9 on 2 and 862 DF, p-value: < 2.2e-16
Calculate RMSE on training and validation set
# Training RMSE for the Engine_HP-only model
y_pred = predict(old_lin_reg, newdata = old_train_set)
rmse(old_train_set[, MSRP], y_pred)
[1] 0.2851045
# Validation RMSE for the Engine_HP-only model
y_pred = predict(old_lin_reg, newdata = old_valid_set)
rmse(old_valid_set[, MSRP], y_pred)
[1] 0.2027388
Visualize model
# Scatter of observed MSRP vs Engine_HP with the fitted curve overlaid in
# red. The original called ggmatplot(..., add = TRUE), which is not a valid
# way to overlay on a ggplot object; build a single ggplot instead.
# geom_line() connects points in order of x, giving the fitted curve.
plot_df <- data.frame(Engine_HP = old_valid_set$Engine_HP,
                      MSRP = old_valid_set$MSRP,
                      fit = y_pred)
ggplot(plot_df, aes(x = Engine_HP, y = MSRP)) +
  geom_point() +
  geom_line(aes(y = fit), colour = "red")
Load test set
# Read the unlabeled test set
car_test = fread("./data/car_price_test.csv")
Check for missing values in Features of Interest
# Verify the features of interest are complete in the test set (expect 0)
sum(is.na(car_test[, foi, with = FALSE]))
[1] 0
MODEL 1.3 : Tuned Random Forest
# Model 1.3: predict with the tuned random forest on the test features
y_pred_rf = predict(rf_reg_2, newdata = car_test[, foi, with = FALSE])
Save as csv file
# write.csv(data.frame(ID = car_test$ID, Price = y_pred_rf),
# row.names = FALSE,
# file = "./results/Davide_Belfiori_submission1.csv")
MODEL 2 : Neural Network
# Model 2: apply the training-set scaler to the test features, then predict
car_test_scaled = predict(dataScaler, car_test[, foi, with = FALSE])
y_pred_nn = predict(nn_reg, car_test_scaled)
Save as csv file
# write.csv(data.frame(ID = car_test$ID, Price = y_pred_nn),
# row.names = FALSE,
# file = "./results/Davide_Belfiori_submission2.csv")
MODEL 3 : Linear Regression
# Model 3: piecewise linear predictions — young and old cars each use
# their own model. A bare logical expression in i selects rows directly.
# NOTE(review): old_lin_reg was last refit with Engine_HP only — confirm
# that is the intended model for the old-car predictions.
young_car_test = car_test[Age < age_ths]
y_pred_new = predict(young_lin_reg, newdata = young_car_test)
old_car_test = car_test[Age >= age_ths]
y_pred_old = predict(old_lin_reg, newdata = old_car_test)
Save as csv file
# write.csv(rbind(data.frame(ID = young_car_test$ID, Price = y_pred_new),
# data.frame(ID = old_car_test$ID, Price = y_pred_old)),
# row.names = FALSE,
# file = "./results/Davide_Belfiori_submission3.csv")
Compare results
# Compare predicted MSRP distributions from the three models:
# random forest (blue), neural network (red), piecewise linear (green)
hist(y_pred_rf,
     breaks = 20,
     col = rgb(0, 0, 1, 1/4),
     main = "Summary",
     xlab = "MSRP")
hist(y_pred_nn, breaks = 20, col = rgb(1, 0, 0, 1/4), add = TRUE)
hist(c(y_pred_new, y_pred_old), breaks = 20, col = rgb(0, 1, 0, 1/4), add = TRUE)
box()